The following code visualizes the IMDb dataset of Indian movies. The code is divided into four sections:
# import libraries
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import collections
from collections import namedtuple
import pandas_profiling
from IPython.display import display
import warnings
warnings.filterwarnings('ignore')
# Read Data
# Load the movie table from the CSV file, decoding it as ISO-8859-1.
df = pd.read_csv(
    "indian_movies.csv",
    encoding="ISO-8859-1",
)
# Preview the first rows (shown as the cell output when run in a notebook).
df.head()
| Name | Year | Duration | Genre | Rating | Votes | Director | Actor 1 | Actor 2 | Actor 3 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NaN | NaN | Drama | NaN | NaN | J.S. Randhawa | Manmauji | Birbal | Rajendra Bhatia | |
| 1 | #Gadhvi (He thought he was Gandhi) | (2019) | 109 min | Drama | 7.0 | 8 | Gaurav Bakshi | Rasika Dugal | Vivek Ghamande | Arvind Jangid |
| 2 | #Homecoming | (2021) | 90 min | Drama, Musical | NaN | NaN | Soumyajit Majumdar | Sayani Gupta | Plabita Borthakur | Roy Angana |
| 3 | #Yaaram | (2019) | 110 min | Comedy, Romance | 4.4 | 35 | Ovais Khan | Prateik | Ishita Raj | Siddhant Kapoor |
| 4 | ...And Once Again | (2010) | 105 min | Drama | NaN | NaN | Amol Palekar | Rajat Kapoor | Rituparna Sengupta | Antara Mali |
# Analysing Data
# Build a profiling report for the whole frame and write it out as an HTML
# file next to the script.
report = pandas_profiling.ProfileReport(df)
report.to_file(
    output_file='profilingReport.html',
)
After looking at the Pandas Profiling Report, we come to the following conclusions:
# Missing Values
# Flag the rows in which every column at positions 1-8 is NaN. The reduction
# is done with the vectorised DataFrame.all(axis=1) rather than calling a
# Python lambda once per row.
# NOTE(review): the positional slice 1:9 stops before the tenth (last) column
# of the frame -- confirm that leaving that column out of the check is intended.
all_missing = df.iloc[:, 1:9].isna().all(axis=1)
null_rows = df[all_missing]
print("Missing values from each column:")
# Preview of the flagged rows (cell output in a notebook).
null_rows.head()
Missing values from each column:
| Name | Year | Duration | Genre | Rating | Votes | Director | Actor 1 | Actor 2 | Actor 3 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 1836 | Bang Bang Reloaded | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1920 | Battle of bittora | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2653 | Campus | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3403 | Dancing Dad | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3807 | Dial 100 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
# Keep only the rows that have at least one non-missing value among the
# columns at positions 1-8; the row-wise check uses the vectorised
# DataFrame.all(axis=1) instead of a per-row Python lambda.
df = df[~df.iloc[:, 1:9].isna().all(axis=1)]
# Remaining (rows, columns) after the filter.
df.shape
(15501, 10)
# Duplicate Values
# keep=False marks every member of a duplicated (Name, Year) group, so all
# copies are listed rather than only the repeats after the first.
is_repeated = df.duplicated(subset=['Name', 'Year'], keep=False)
duplicate = df[is_repeated]
print("Duplicate rows according to Name and Year:")
# Preview of the duplicated rows (cell output in a notebook).
duplicate.head()
Duplicate rows according to Name and Year:
| Name | Year | Duration | Genre | Rating | Votes | Director | Actor 1 | Actor 2 | Actor 3 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 135 | A Ballad of Maladies | (2016) | 86 min | Documentary, Music | NaN | NaN | Sarvnik Kaur | Tushar Madhav | NaN | NaN |
| 136 | A Ballad of Maladies | (2016) | 86 min | Documentary, History, Music | NaN | NaN | Sarvnik Kaur | Tushar Madhav | NaN | NaN |
| 585 | Achanak | (1988) | NaN | Thriller | NaN | NaN | Rakesh Kashyap | Om Puri | Kunickaa Sadanand | Kanwaljit Singh |
| 586 | Achanak | (1988) | NaN | Drama | NaN | NaN | Kavi Raz | Alka | Amarjeet Kaur | Kavi Raz |
| 953 | Amrit | (1941) | 153 min | NaN | NaN | NaN | Master Vinayak | Dada Salvi | Baburao Pendharkar | Lalita Pawar |
# Dropping Duplicates
# Keep one row per (Name, Year) pair; with the default `keep`, the first
# occurrence is the one retained.
df = df.drop_duplicates(subset=['Name', 'Year'])
# Remaining (rows, columns) after de-duplication.
df.shape
(15485, 10)
# Rows in which the columns at positions 1, 2, 4 and 5 are all NaN
# (Year, Duration, Rating and Votes according to the preview table --
# verify against the CSV header). The mask is computed once, with a
# vectorised reduction, and reused for both selections.
no_core_info = df.iloc[:, [1, 2, 4, 5]].isna().all(axis=1)
null_rows = df[no_core_info]
# .copy() makes df an independent frame, so the column assignments below do
# not write into a slice of the previous frame.
df = df[~no_core_info].copy()
# regex=False: '(' and ')' are regex metacharacters, so the literal
# replacement is requested explicitly instead of relying on the pandas
# version's default for `regex`.
df['Year'] = (
    df['Year']
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)  # Removing brackets from Year
df['Duration'] = df['Duration'].str.replace(' min', '', regex=False)  # Removing min from Duration
# Year is still a string column at this point, hence the comparison with '2022'.
df.drop(df.loc[df['Year'] == '2022'].index, inplace=True)  # Removing 2022 movies data
df.head()
| Name | Year | Duration | Genre | Rating | Votes | Director | Actor 1 | Actor 2 | Actor 3 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 1 | #Gadhvi (He thought he was Gandhi) | 2019 | 109 | Drama | 7.0 | 8 | Gaurav Bakshi | Rasika Dugal | Vivek Ghamande | Arvind Jangid |
| 2 | #Homecoming | 2021 | 90 | Drama, Musical | NaN | NaN | Soumyajit Majumdar | Sayani Gupta | Plabita Borthakur | Roy Angana |
| 3 | #Yaaram | 2019 | 110 | Comedy, Romance | 4.4 | 35 | Ovais Khan | Prateik | Ishita Raj | Siddhant Kapoor |
| 4 | ...And Once Again | 2010 | 105 | Drama | NaN | NaN | Amol Palekar | Rajat Kapoor | Rituparna Sengupta | Antara Mali |
| 5 | ...Aur Pyaar Ho Gaya | 1997 | 147 | Comedy, Drama, Musical | 4.7 | 827 | Rahul Rawail | Bobby Deol | Aishwarya Rai Bachchan | Shammi Kapoor |
Plotting various graphs to visualize the data throughout the years.
# Number of movies per release year. Counting the 'Name' column directly on
# the groupby avoids running a Python lambda per group; like the original,
# rows whose Name is NaN are not counted.
year_count = df.groupby('Year')['Name'].count().reset_index(name='Count')
fig = px.bar(
    year_count,
    x='Year',
    y='Count',
    text='Count',
    title='Number of movies by year of launch',
)
# Show the count above each bar, formatted with two significant digits.
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
# Axis titles are given as title=dict(text=..., font=...): the older
# `titlefont` axis attribute is deprecated and rejected by recent plotly
# releases.
fig.update_layout(
    uniformtext_mode='show',
    xaxis=dict(
        title=dict(text='Year of Movie Release', font=dict(size=16)),
    ),
    yaxis=dict(
        title=dict(text='Count of Movies Released', font=dict(size=16)),
        tickfont=dict(size=14),
    ),
)
fig.show()
# One indicator column per genre: the 'Genre' strings are split on ', ' and
# each distinct label becomes a 0/1 column.
dummies = df['Genre'].str.get_dummies(sep=', ')
# Append the indicator columns to the movie table and turn every 0 into NaN.
# NOTE(review): replace(0, np.nan) is applied to the whole frame, so a 0 in
# any of the original columns becomes NaN in df_genre as well -- confirm that
# this is acceptable.
df_genre = pd.concat([df, dummies], axis=1).replace(0, np.nan)
# Total number of movies per genre, most frequent first (cell output).
non_genre_columns = [
    'Name', 'Year', 'Duration', 'Genre', 'Rating',
    'Votes', 'Director', 'Actor 1', 'Actor 2', 'Actor 3',
]
df_genre.drop(columns=non_genre_columns).sum().sort_values(ascending=False)
Drama 7064.0 Action 3479.0 Romance 2399.0 Comedy 1955.0 Thriller 1643.0 Crime 1290.0 Family 930.0 Musical 580.0 Adventure 524.0 Mystery 501.0 Horror 499.0 Fantasy 448.0 Documentary 377.0 History 195.0 Biography 192.0 Animation 120.0 Music 85.0 Sport 66.0 Sci-Fi 55.0 War 43.0 News 9.0 Western 5.0 Reality-TV 3.0 Short 1.0 dtype: float64
# Per-year totals for every genre indicator column: drop the descriptive
# columns (keeping 'Year'), sum within each year, then move 'Year' back from
# the index into a regular column.
df_genre_count = (
    df_genre
    .drop(
        columns=[
            'Name', 'Duration', 'Genre', 'Rating', 'Votes',
            'Director', 'Actor 1', 'Actor 2', 'Actor 3',
        ]
    )
    .groupby('Year')
    .sum()
    .reset_index(level=0)
)
# Genres drawn on the chart, each mapped to its trace colour. The traces are
# added in this order, which is also the legend order.
genre_colors = {
    'Action': '#2E91E5',
    'Adventure': '#E15F99',
    'Comedy': '#DA16FF',
    'Drama': '#750D86',
    'Horror': '#FB00D1',
    'Romance': '#A777F1',
    'Thriller': '#6C4516',
}

# One line per genre: yearly movie count against release year. A single loop
# replaces seven copy-pasted add_trace calls that differed only in genre
# name and colour.
fig = go.Figure()
for genre, color in genre_colors.items():
    fig.add_trace(
        go.Scatter(
            x=df_genre_count['Year'],
            y=df_genre_count[genre],
            mode='lines',
            name=genre,
            marker_color=color,
        )
    )

# The y-axis title is given as title=dict(text=..., font=...): the older
# `titlefont` axis attribute is deprecated and rejected by recent plotly
# releases.
fig.update_layout(
    title='Genre through the Years',
    xaxis_tickfont_size=14,
    height=800,
    yaxis=dict(
        title=dict(text='Count', font=dict(size=16)),
        tickfont=dict(size=14),
    ),
    legend=dict(
        y=0,
        x=1.0,
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)',
    ),
)
fig.show()
# Reshape the three actor columns into one long (Actor, Year) table: each
# billing column is paired with 'Year' and renamed to the common name 'Actor'.
Actor1, Actor2, Actor3 = (
    df[[billing_column, 'Year']].rename(columns={billing_column: 'Actor'})
    for billing_column in ('Actor 1', 'Actor 2', 'Actor 3')
)
# Stack the three tables, discard rows with a missing actor or year, and add
# a constant Count column of ones.
Actor_Year = (
    pd.concat([Actor1, Actor2, Actor3], ignore_index=True)
    .dropna()
    .assign(Count=1)
)
# Number of appearances per actor, most frequent first.
appearances = Actor_Year['Actor'].value_counts()
Actor_Top = appearances.rename_axis('Actor').reset_index(name='Count')
# Bar chart of the 20 actors with the most appearances.
fig = px.bar(
    Actor_Top.head(20),
    x='Actor',
    y='Count',
    title='Top 20 actors by number of Movies made',
)
fig.show()
# Names of the 20 actors with the most appearances.
Top_Actor = Actor_Top.head(20)['Actor']
# Restrict the long (Actor, Year) table to those actors and order the rows by
# actor following Top_Actor, then bring 'Actor' back as a regular column.
in_top_20 = Actor_Year['Actor'].isin(Top_Actor)
Actor_Year = (
    Actor_Year[in_top_20]
    .set_index('Actor')
    .loc[Top_Actor]
    .reset_index(level=0)
)
# Sort by year first, then convert the year values to datetimes for the
# time axis of the plot.
Actor_Year = Actor_Year.sort_values('Year')
Actor_Year['Year'] = pd.to_datetime(Actor_Year['Year'])
# One strip per actor, with a point for each appearance placed at its year.
fig = px.strip(
    Actor_Year,
    x="Year",
    y='Actor',
    color="Actor",
    color_discrete_sequence=px.colors.qualitative.Dark24,
    height=600,
    title='Top 20 actors by number of Movies made through the Years',
)
fig.show()